Smoking Paper - Main
Genus level
Data preparation
> # from Bioconductor (install by running: BiocManager::install("x") | x = name of the package)
> # BiocManager::install(c("DESeq2", "phyloseq", "microbiome"))
>
> #github functions
> # devtools::install_github("bryandmartin/corncob")
> # remotes::install_github("g-antonello/gautils")
> # remotes::install_github("mikemc/speedyseq")#
> # devtools::install_github("gaospecial/ggvenn")
>
> library(gautils)
> library(magrittr) # imported separately from tidyverse to also import '%<>%'
> library(ggpubr)
> library(kableExtra)
>
> # graphics packages
> # library(ggpubr)
> # library(magrittr)
> # # my functions
> # source("my_personal_functions.R")
> #
> #
> # library(speedyseq)
> theme_set(theme_light())
>
>
> chrismb_phy <- readRDS("/shared/statgen/microbiome/CHRISMB/processed_data/microbiome_tables/phyloseq_built.Rds")
>
> threshold_prevalence <- 0.01
> threshold_detection <- 10Download CHRIS data
These are based on data retrieval from the chrisData internal R package curated by J. Rainer. Variable selection is done via consulting the codebook.
> chrismb_phy <- readRDS("/shared/statgen/microbiome/CHRISMB/processed_data/microbiome_tables/phyloseq_built.Rds")
>
> library(chrisData)
>
> possible_data_files <- chrisDataFile()
>
> selected_data_files <- c(
+ "general_information",
+ "household",
+ "clinical_traits",
+ "interview",
+ "labdata",
+ "substudies",
+ "drugs_summary"
+ )
>
> data_selected.list <- sapply(selected_data_files, function(x) chrisData(paste0(x, ".txt.gz")), USE.NAMES = T)
>
> data_selected.joined.df <- purrr::reduce(data_selected.list, full_join, by = "AID")
> data_selected.joined.df <- data_selected.joined.df[match(chrismb_phy@sam_data$AID, data_selected.joined.df$AID),]
>
> # Columns with all NAs must be necessarily removed
> all_NAs <- apply(data_selected.joined.df, 2, function(x) all(is.na(x)))
> data_selected.joined.df <- data_selected.joined.df[, !all_NAs]
>
> # Columns of class "chron" are problematic, they must be removed to be able to work with phyloseq and metadata data.frame back and forth
> data_selected.joined_NODates.df <- select(data_selected.joined.df, -contains(c("x0bp08", "x0bc03", "x0bc06")))
>
> rm(data_selected.list)
>
> chrismb_phy <- phy_add_metadata_variables(chrismb_phy, df = data_selected.joined_NODates.df, by = "AID")
>
>
> levels(chrismb_phy@sam_data$x0oh01) <- c("0", "1-9", "10-19", "20+")
> levels(chrismb_phy@sam_data$x0sm51) <- c("Never", "Former", "Current (R)", "Current (NR)")
> chrismb_phy@sam_data$x0an03a %<>% as.numeric()
>
> chrismb_phy@sam_data$x0_ager_cat <- cut(chrismb_phy@sam_data$x0_ager, breaks = c(18, 31, 41, 51, 61, 71, max(chrismb_phy@sam_data$x0_ager)+1), include.lowest = TRUE, labels = c("18-30", "31-40", "41-50", "51-60", "61-70", "71+"))
>
> # create some househod (hid) info
> chrismb_phy <- relocate_sample_data(chrismb_phy, x0_ager_cat, .after = x0_ager) %>%
+ mutate_sample_data(
+ hid = paste0("h", hid),
+ hid_size_chrismb = add_N_to_catVar(hid) %>% as.character() %>% gsub("h.*\\ ", "", .) %>% parse_number(),
+ smoking_exposure_ga =
+ case_when(
+ grepl("Current", as.character(x0sm51)) ~ "Current",
+ TRUE ~ as.character(x0sm51)
+ ) %>%
+ factor(levels = c("Never", "Former", "Current"))
+ ) %>%
+ relocate_sample_data(smoking_exposure_ga, .before = x0sm51)Smoking variables generation
Bin smoking intensity (g/day)
> ###############################
> ## bin daily tobacco smoked
> ###############################
>
> chrismb_phy %<>%
+ mutate_sample_data(
+ ### tobacco g per day (bin 5 by 5)
+ x0sm61_bin5cont = case_when(
+ smoking_exposure_ga == "Current" & x0sm61 <= 1 ~ 1,
+ smoking_exposure_ga == "Current" & x0sm61 <= 5 ~ 5,
+ smoking_exposure_ga == "Current" & x0sm61 <= 10 ~ 10,
+ smoking_exposure_ga == "Current" & x0sm61 <= 15 ~ 15,
+ smoking_exposure_ga == "Current" & x0sm61 <= 20 ~ 20,
+ smoking_exposure_ga == "Current" & x0sm61 <= 25 ~ 25,
+ smoking_exposure_ga == "Current" & x0sm61 <= 30 ~ 30,
+ TRUE ~ NA
+ ),
+
+ x0sm61_bin2cont = case_when(
+ smoking_exposure_ga == "Current" & x0sm61 <= 2 ~ 2,
+ smoking_exposure_ga == "Current" & x0sm61 <= 4 ~ 4,
+ smoking_exposure_ga == "Current" & x0sm61 <= 6 ~ 6,
+ smoking_exposure_ga == "Current" & x0sm61 <= 8 ~ 8,
+ smoking_exposure_ga == "Current" & x0sm61 <= 10 ~ 10,
+ smoking_exposure_ga == "Current" & x0sm61 <= 12 ~ 12,
+ smoking_exposure_ga == "Current" & x0sm61 <= 14 ~ 14,
+ smoking_exposure_ga == "Current" & x0sm61 <= 16 ~ 16,
+ smoking_exposure_ga == "Current" & x0sm61 <= 18 ~ 18,
+ smoking_exposure_ga == "Current" & x0sm61 <= 20 ~ 20,
+ smoking_exposure_ga == "Current" & x0sm61 <= 22 ~ 22,
+ smoking_exposure_ga == "Current" & x0sm61 <= 24 ~ 24,
+ smoking_exposure_ga == "Current" & x0sm61 <= 26 ~ 26,
+ smoking_exposure_ga == "Current" & x0sm61 <= 28 ~ 28,
+ smoking_exposure_ga == "Current" & x0sm61 <= 30 ~ 30,
+ smoking_exposure_ga == "Current" & x0sm61 <= 60 ~ 60,
+ TRUE ~ NA
+ ))Error in `dplyr::mutate()`:
! Problem while computing `x0sm61_bin5cont = case_when(...)`.
Caused by error in `case_when()`:
Bin Years since quitting
> # x0sm40 = age when people quit or reduce
> # x0_ager = age rounded to closest integer
>
> max_value <- max(chrismb_phy@sam_data$x0_ager - chrismb_phy@sam_data$x0sm40, na.rm = T)
>
> chrismb_phy %<>%
+ mutate_sample_data(
+ years_since_quitting_or_reducing = x0_ager - x0sm40,
+
+ years_since_quitting_binned = case_when(
+ smoking_exposure_ga == "Former" & !is.na(years_since_quitting_or_reducing) ~ cut(years_since_quitting_or_reducing,
+ breaks = c(0, 1, 3, 5, 10, max_value),
+ include.lowest = TRUE
+ )
+ )
+ )
>
> rm(max_value)Ultimate smoking variable for plots: “smoking_detailed”
> max_years_since_quitting <- max(chrismb_phy@sam_data$years_since_quitting_or_reducing, na.rm = T)
>
> chrismb_phy %<>%
+ mutate_sample_data(
+ smoking_detailed = case_when(
+ smoking_exposure_ga == "Never" ~ "Never",
+
+ smoking_exposure_ga == "Former" & years_since_quitting_or_reducing <= 1 ~ "Former 0-1 y",
+ smoking_exposure_ga == "Former" & years_since_quitting_or_reducing <= 3 ~ "Former 2-3 y",
+ smoking_exposure_ga == "Former" & years_since_quitting_or_reducing <= 5 ~ "Former 4-5 y",
+ smoking_exposure_ga == "Former" & years_since_quitting_or_reducing <= 10 ~ "Former 6-10 y",
+ smoking_exposure_ga == "Former" & years_since_quitting_or_reducing <= max_years_since_quitting ~ "Former 11-62 y",
+ smoking_exposure_ga == "Former" & is.na(years_since_quitting_or_reducing) ~ "Former NA y",
+
+ smoking_exposure_ga == "Current" & x0sm61 < 2 ~ "Current < 2 g",
+ smoking_exposure_ga == "Current" & x0sm61 <= 3 ~ "Current 2-3 g",
+ smoking_exposure_ga == "Current" & x0sm61 <= 5 ~ "Current 4-5 g",
+ smoking_exposure_ga == "Current" & x0sm61 <= 10 ~ "Current 6-10 g",
+ smoking_exposure_ga == "Current" & x0sm61 <= 15 ~ "Current 11-15 g",
+ smoking_exposure_ga == "Current" & x0sm61 <= 20 ~ "Current 16-20 g",
+ smoking_exposure_ga == "Current" & x0sm61 > 20 ~ "Current > 20 g",
+ smoking_exposure_ga == "Current" & is.na(x0sm61) ~ "Current NA g",
+ TRUE ~ "Unknown"
+ ),
+
+ smoking_detailed = factor(smoking_detailed, levels = unique(smoking_detailed)[c(14, 4, 10, 6, 3, 7, 12, 8, 11, 13, 5, 9, 2, 16, 1, 15)])
+ )Available Samples for statistical analysis
> excluded_samples_upon_request <- readLines("/home/gantonello/CHRISMB/participants lists each study/withdrawnConsents_AIDs.txt")
>
> chrismb_phy_core <- core(chrismb_phy, detection = threshold_detection, prevalence = threshold_prevalence)
>
> phy_Q1 <- chrismb_phy_core %>%
+ filter_sample_data(!(AID %in% excluded_samples_upon_request),
+ !is.na(x0oh01),
+ !is.na(x0_ager),
+ !is.na(x0_sex),
+ !is.na(x0sm51),
+ #x0bc10 != 1, # people who smoked were reported as 1, NAs are treated as non-smoking
+ x0dd51 == "No")
>
> phy_Q1phyloseq-class experiment-level object
otu_table() OTU Table: [ 622 taxa and 1601 samples ]:
sample_data() Sample Data: [ 1601 samples by 1500 sample variables ]:
tax_table() Taxonomy Table: [ 622 taxa by 8 taxonomic ranks ]:
phy_tree() Phylogenetic Tree: [ 622 tips and 621 internal nodes ]:
refseq() DNAStringSet: [ 622 reference sequences ]
taxa are rows
> phy_Q1_genus <- phy_tax_glom(phy_Q1, "Genus")
>
>
> phy_Q1_meta <- meta(phy_Q1)> dir.create("../results")Table 1
Table shorter
> library(table1)
>
> tbl1 <- phy_Q1_meta %>%
+ # do some renaming, just for the Table output
+ dplyr::rename(
+ Sex = x0_sex,
+ `Age Category (years)` = x0_ager_cat,
+ `N° Teeth (self-reported)` = x0oh01,
+ `Gums Health (self-reported)` = x0oh02b
+ ) %>%
+ # pipe this temporary data into the table 1
+ table1(~ Sex + `Age Category (years)` + `N° Teeth (self-reported)` + `Gums Health (self-reported)`| x0sm51,
+ data = .,
+ extra.col = list("X² p-value" = table1.pvalues),
+ caption = "Demographics of CHRISMB cohort, in South Tyrol, Italy, with respect to smoking habit. Per-column percentages were also reported in brackets. Current smokers were separated into smokers who reduced daily smoking dosage some time in the past - Current (R), and those who did not reduce - Current (NR). The whole cohort is included under the “CHRISMB” column. Significance is calculated as X² test for categorical variables and as Kruskal-Wallis non-parametric analysis of variance for continuous variables (Blood pressure, Body Mass Index",
+ #overall = "CHRISMB"
+ )
>
> tbl1| Never (N=880) |
Former (N=395) |
Current (R) (N=100) |
Current (NR) (N=226) |
Overall (N=1601) |
X² p-value | |
|---|---|---|---|---|---|---|
| Sex | ||||||
| Male | 356 (40.5%) | 222 (56.2%) | 58 (58.0%) | 115 (50.9%) | 751 (46.9%) | 5.1e-07 |
| Female | 524 (59.5%) | 173 (43.8%) | 42 (42.0%) | 111 (49.1%) | 850 (53.1%) | |
| Age Category (years) | ||||||
| 18-30 | 238 (27.0%) | 41 (10.4%) | 38 (38.0%) | 92 (40.7%) | 409 (25.5%) | 2.2e-17 |
| 31-40 | 139 (15.8%) | 73 (18.5%) | 18 (18.0%) | 39 (17.3%) | 269 (16.8%) | |
| 41-50 | 196 (22.3%) | 75 (19.0%) | 23 (23.0%) | 41 (18.1%) | 335 (20.9%) | |
| 51-60 | 144 (16.4%) | 112 (28.4%) | 12 (12.0%) | 39 (17.3%) | 307 (19.2%) | |
| 61-70 | 93 (10.6%) | 57 (14.4%) | 8 (8.0%) | 15 (6.6%) | 173 (10.8%) | |
| 71+ | 70 (8.0%) | 37 (9.4%) | 1 (1.0%) | 0 (0%) | 108 (6.7%) | |
| N° Teeth (self-reported) | ||||||
| 0 | 50 (5.7%) | 23 (5.8%) | 7 (7.0%) | 9 (4.0%) | 89 (5.6%) | 0.11 |
| 1-9 | 57 (6.5%) | 41 (10.4%) | 4 (4.0%) | 16 (7.1%) | 118 (7.4%) | |
| 10-19 | 117 (13.3%) | 74 (18.7%) | 13 (13.0%) | 35 (15.5%) | 239 (14.9%) | |
| 20+ | 656 (74.5%) | 257 (65.1%) | 76 (76.0%) | 166 (73.5%) | 1155 (72.1%) | |
| Gums Health (self-reported) | ||||||
| Excellent | 45 (5.1%) | 18 (4.6%) | 4 (4.0%) | 12 (5.3%) | 79 (4.9%) | 0.44 |
| Very good | 188 (21.4%) | 79 (20.0%) | 20 (20.0%) | 44 (19.5%) | 331 (20.7%) | |
| Good | 291 (33.1%) | 124 (31.4%) | 30 (30.0%) | 57 (25.2%) | 502 (31.4%) | |
| Average | 229 (26.0%) | 84 (21.3%) | 27 (27.0%) | 72 (31.9%) | 412 (25.7%) | |
| Poor | 47 (5.3%) | 24 (6.1%) | 9 (9.0%) | 13 (5.8%) | 93 (5.8%) | |
| Very poor | 6 (0.7%) | 2 (0.5%) | 3 (3.0%) | 0 (0%) | 11 (0.7%) | |
| Missing | 74 (8.4%) | 64 (16.2%) | 7 (7.0%) | 28 (12.4%) | 173 (10.8%) |
Table more variables
> tbl2 <- phy_Q1_meta %>%
+ # do some renaming, just for the Table output
+ dplyr::rename(
+ Sex = x0_sex,
+ `Age Category (years)` = x0_ager_cat,
+ `N° Teeth (self-reported)` = x0oh01,
+ `Gums Health (self-reported)` = x0oh02b,
+ `Systolic Blood Pressure (mmHg)` = x0bp01,
+ `Diastolic Blood Pressure (mmHg)` = x0bp02,
+ `Body Mass Index (kg/m²)` = x0an03a
+ ) %>%
+ # pipe this temporary data into the table 1
+ table1(~ Sex + `Age Category (years)` + `N° Teeth (self-reported)` + `Gums Health (self-reported)` + `Systolic Blood Pressure (mmHg)` + `Diastolic Blood Pressure (mmHg)` + `Body Mass Index (kg/m²)`| x0sm51,
+ data = .,
+ extra.col = list("P-value" = table1.pvalues),
+ caption = "Demographics of CHRISMB cohort, in South Tyrol, Italy, with respect to smoking habit. Per-column percentages were also reported in brackets. Current smokers were separated into smokers who reduced daily smoking dosage some time in the past - Current (R), and those who did not reduce - Current (NR). The whole cohort is included under the “CHRISMB” column. Abbreviations: mmHg = millimeters of mercury, kg = kilograms, m² square meters. Significance is calculated as X² test for categorical variables and as Kruskal-Wallis non-parametric analysis of variance for continuous variables (Blood pressure, Body Mass Index",
+ #overall = "CHRISMB"
+ )
>
> tbl2| Never (N=880) |
Former (N=395) |
Current (R) (N=100) |
Current (NR) (N=226) |
Overall (N=1601) |
P-value | |
|---|---|---|---|---|---|---|
| Sex | ||||||
| Male | 356 (40.5%) | 222 (56.2%) | 58 (58.0%) | 115 (50.9%) | 751 (46.9%) | 5.1e-07 |
| Female | 524 (59.5%) | 173 (43.8%) | 42 (42.0%) | 111 (49.1%) | 850 (53.1%) | |
| Age Category (years) | ||||||
| 18-30 | 238 (27.0%) | 41 (10.4%) | 38 (38.0%) | 92 (40.7%) | 409 (25.5%) | 2.2e-17 |
| 31-40 | 139 (15.8%) | 73 (18.5%) | 18 (18.0%) | 39 (17.3%) | 269 (16.8%) | |
| 41-50 | 196 (22.3%) | 75 (19.0%) | 23 (23.0%) | 41 (18.1%) | 335 (20.9%) | |
| 51-60 | 144 (16.4%) | 112 (28.4%) | 12 (12.0%) | 39 (17.3%) | 307 (19.2%) | |
| 61-70 | 93 (10.6%) | 57 (14.4%) | 8 (8.0%) | 15 (6.6%) | 173 (10.8%) | |
| 71+ | 70 (8.0%) | 37 (9.4%) | 1 (1.0%) | 0 (0%) | 108 (6.7%) | |
| N° Teeth (self-reported) | ||||||
| 0 | 50 (5.7%) | 23 (5.8%) | 7 (7.0%) | 9 (4.0%) | 89 (5.6%) | 0.11 |
| 1-9 | 57 (6.5%) | 41 (10.4%) | 4 (4.0%) | 16 (7.1%) | 118 (7.4%) | |
| 10-19 | 117 (13.3%) | 74 (18.7%) | 13 (13.0%) | 35 (15.5%) | 239 (14.9%) | |
| 20+ | 656 (74.5%) | 257 (65.1%) | 76 (76.0%) | 166 (73.5%) | 1155 (72.1%) | |
| Gums Health (self-reported) | ||||||
| Excellent | 45 (5.1%) | 18 (4.6%) | 4 (4.0%) | 12 (5.3%) | 79 (4.9%) | 0.44 |
| Very good | 188 (21.4%) | 79 (20.0%) | 20 (20.0%) | 44 (19.5%) | 331 (20.7%) | |
| Good | 291 (33.1%) | 124 (31.4%) | 30 (30.0%) | 57 (25.2%) | 502 (31.4%) | |
| Average | 229 (26.0%) | 84 (21.3%) | 27 (27.0%) | 72 (31.9%) | 412 (25.7%) | |
| Poor | 47 (5.3%) | 24 (6.1%) | 9 (9.0%) | 13 (5.8%) | 93 (5.8%) | |
| Very poor | 6 (0.7%) | 2 (0.5%) | 3 (3.0%) | 0 (0%) | 11 (0.7%) | |
| Missing | 74 (8.4%) | 64 (16.2%) | 7 (7.0%) | 28 (12.4%) | 173 (10.8%) | |
| Systolic Blood Pressure (mmHg) | ||||||
| Mean (SD) | 126 (14.3) | 129 (15.6) | 122 (11.7) | 124 (12.0) | 126 (14.3) | 3e-05 |
| Median [Min, Max] | 124 [98.0, 203] | 128 [99.0, 188] | 121 [97.0, 157] | 122 [100, 170] | 124 [97.0, 203] | |
| Missing | 79 (9.0%) | 37 (9.4%) | 7 (7.0%) | 15 (6.6%) | 138 (8.6%) | |
| Diastolic Blood Pressure (mmHg) | ||||||
| Mean (SD) | 78.0 (8.72) | 79.6 (9.48) | 75.0 (7.62) | 76.8 (8.30) | 78.0 (8.86) | 3.3e-05 |
| Median [Min, Max] | 77.0 [58.0, 115] | 79.0 [58.0, 109] | 74.0 [59.0, 98.0] | 76.0 [58.0, 102] | 77.0 [58.0, 115] | |
| Missing | 79 (9.0%) | 37 (9.4%) | 7 (7.0%) | 15 (6.6%) | 138 (8.6%) | |
| Body Mass Index (kg/m²) | ||||||
| Mean (SD) | 25.2 (4.36) | 26.8 (4.59) | 26.0 (4.69) | 25.4 (4.17) | 25.7 (4.46) | 6.4e-08 |
| Median [Min, Max] | 24.5 [16.4, 48.3] | 26.1 [18.1, 51.3] | 25.1 [17.1, 43.0] | 25.0 [18.2, 37.4] | 25.1 [16.4, 51.3] |
Figure 1 - Microbiota in relation to smoking status
Figure 1A
> div <- phyloseq::distance(microbiome::transform(phy_Q1_genus, "compositional"), "bray")
>
> ord <- ordinate(physeq = phy_Q1_genus,
+ distance = div,
+ method = "PCoA"
+ )
>
> fig1A <- plot_ordination(physeq = phy_Q1_genus,
+ ordination = ord,
+ color = "smoking_exposure_ga")+
+ labs(title = NULL)+
+ theme_light()+
+ theme(legend.position = "bottom")+
+ scale_color_discrete(name = "Smoking habit",type = ggsci::pal_jco("default")(4)[c(1,2,4)])+
+ stat_ellipse(aes(group = smoking_exposure_ga), show.legend = FALSE)+
+ guides(color = guide_legend(override.aes = list(size=3)))
>
> fig1A$labels$caption <- gsub(fig1A$labels$caption, pattern = "\n", replacement = "; ")
> fig1A$labels$x <- paste0("Principal Coordinate 1 ", strsplit(fig1A$labels$x, " ")[[1]][[2]])
> fig1A$labels$y <- paste0("Principal Coordinate 2 ", strsplit(fig1A$labels$y, " ")[[1]][[2]])Figure 1B
> dir.create("../results/Figure1/DiffAbundGenera/", recursive = T)Diff abund 1: DESeq2
> dir.create("../results/Figure1/DiffAbundGenera/DESeq2/", recursive = T)
>
> if(!file.exists("../results/Figure1/DiffAbundGenera/DESeq2/figure1_deseq2_dataframe.list.RDS")){
+ library(DESeq2)
+ #############################################################################
+
+ deseq_object_genus <-
+ phyloseq_to_deseq2(phy_Q1_genus, design = ~ x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga)
+
+ geom_means <- apply(counts(deseq_object_genus), 1, function(x) exp(mean(log(x[x>0]))))
+
+ # estimate size factors, important to account for library size variations
+ estim_size <- estimateSizeFactors(deseq_object_genus, geoMeans = geom_means)
+
+ # compute the DESeq2 glm, accounting for the estimate size factor and geometric means
+
+ deseq2_results_genus <- DESeq(estim_size, fitType = "local")
+
+
+ saveRDS(deseq2_results_genus, "../results/Figure1/DiffAbundGenera/DESeq2/figure1_deseq2_dataframe.list.RDS")
+ } else{
+ deseq2_results_genus <- readRDS("../results/Figure1/DiffAbundGenera/DESeq2/figure1_deseq2_dataframe.list.RDS")
+ }Diff abund 2: LinDA
Diff abund 3: Maaslin2
Diff abund 4: ALDEx2 (glm)
> dir.create("../results/Figure1/DiffAbundGenera/ALDEx2/", recursive = T)
>
> if (!file.exists("../results/Figure1/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults_genera.list.RDS")) {
+
+ library(ALDEx2)
+ # define model matrix, based on the metadata
+ model_mtx <-
+ model.matrix(
+ ~ x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga,
+ data = dplyr::select(
+ meta(phy_Q1_genus),
+ x0oh01,
+ x0_sex,
+ x0_ager_cat,
+ smoking_exposure_ga
+ )
+ )
+ # perform the clr differential abundance
+ aldex.glm.genera_smoking_ga <-
+ aldex.clr(abundances(phy_Q1_genus),
+ model_mtx,
+ mc.samples = 200,
+ denom = "all",
+ useMC = T
+ )
+
+ # make glm, obtain significance of terms
+ glm.test_genera_smoking_ga <- aldex.glm(aldex.glm.genera_smoking_ga, model_mtx)
+
+ # this step gets estimates effect sizes, it takes ages, like 30-60 minutes
+ glm.effects_genera_smoking_ga <- aldex.glm.effect(aldex.glm.genera_smoking_ga, include.sample.summary = T, verbose = FALSE, useMC = T)
+
+ # aggregate these results into one data frame, that looks like lme4 output
+ aldex.results.genera <- list(clr.model = aldex.glm.genera_smoking_ga,
+ glm.test = glm.test_genera_smoking_ga,
+ eff_size.test = glm.effects_genera_smoking_ga)
+
+
+ saveRDS(aldex.results.genera, "../results/Figure1/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults_genera.list.RDS")
+
+ } else {
+ aldex.results.genera <- readRDS("../results/Figure1/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults_genera.list.RDS")
+ }Diff abund 5: ANCOM-BC
> dir.create("../results/Figure1/DiffAbundGenera/ANCOM-BC/", recursive = T)
>
> if (!file.exists("../results/Figure1/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults_genera.list.RDS")) {
+ tmp <- mia::makeTreeSummarizedExperimentFromPhyloseq(phy_Q1_genus %>% select_sample_data(x0_sex, x0_ager_cat, x0oh01, smoking_exposure_ga))
+
+
+ library(ANCOMBC)
+
+ # takes around 30 minutes
+ ancom_results <- ancombc2(data = tmp,
+ assay_name = "counts",
+ tax_level = NULL,
+ fix_formula = "x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga",
+ group = "smoking_exposure_ga",
+ struc_zero = TRUE,
+ iter_control = list(tol = 1e-5, max_iter = 20, verbose = FALSE),
+ em_control = list(tol = 1e-5, max_iter = 20),
+ p_adj_method = "BH",
+ n_cl = 1,
+ alpha = 0.05)
+
+ saveRDS(ancom_results, "../results/Figure1/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults_genera.list.RDS")
+ detach("package:ANCOMBC", unload = T)
+ }else{
+
+ ancom_results <- readRDS("../results/Figure1/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults_genera.list.RDS")
+
+ }Summary of results
Results filtered by two conditions:
- FDR (5%) corrected \(q-value < 0.05\)
- The effect size, irrespectively of the way it is estimated, must be larger than 0.5
Since this value is rather arbitrary, one could devise a quantile cutoff, but it’s hard to decide where to cut, since we have no a priori expectations of the effect size distributions.
Consensus of methods, quick discussion
> ggarrange(ggarrange(plot_correlations_eff.sizes,
+ venn_plot+
+ scale_x_continuous(expand = expansion(mult = .2)),
+
+ nrow = 2, ncol = 1, heights = c(1,0.5)
+ ),
+
+ heatmap_plot,
+
+ nrow = 1, ncol = 2
+ ) %>%
+ annotate_figure(top = text_grob("Discussion of differential abundance\nresults similarity across methods", size = 16, face = "bold", color = "red2"),
+ bottom = text_grob(paste("NAs in the heatmap were arbitrarily set to", NA_value, "in order not to fail clustering,\nwhile still keeping evident track of where a method did not return a pathway as significant"), size = 10))Figure 1C - Oxygen Metabolism
> dir.create("../results/Figure1/OxygenMetabolism/", recursive = T)
>
> otutab_with_genus_name <- phy_Q1_genus %>%
+ microbiome::transform("compositional") %>%
+ abundances() %>%
+ as.data.frame() %>%
+ rownames_to_column("Genus") %>% # next step helps assigning the highest number of genera/families possible (because among the taxa in the table there are some families too)
+ mutate(Genus_easier = sapply(strsplit(as.character(Genus), "_"), function(g) g[[1]]))
>
> # get the metadata of the oxygen metabolism vs Genus
> oxygen_requirement_taxa <- read.csv("https://raw.githubusercontent.com/mcalgaro93/sc2meta/master/data/genera_methabolism.tsv", sep = "\t") %>%
+ set_names(c("Genus_easier", "oxygen_requirements")) %>%
+ mutate(oxygen_requirements = gsub("F ", "Facultative ", oxygen_requirements, fixed = T))
>
> # merge otu table and oxygen requirement data by simplified genus name
> molten_df_merged_oxygen <- merge(otutab_with_genus_name, oxygen_requirement_taxa, by = "Genus_easier", all.x = TRUE)
> molten_df_merged_oxygen$oxygen_requirements <- ifelse(is.na(molten_df_merged_oxygen$oxygen_requirements), "NA",molten_df_merged_oxygen$oxygen_requirements)
>
> # aggregate by oxygen metabolism, instead of genera, to make code easier
>
> tmp <- molten_df_merged_oxygen %>%
+ dplyr::select(!contains("Genus")) %>%
+ group_by(oxygen_requirements) %>%
+ summarise_all(sum) %>%
+ column_to_rownames("oxygen_requirements") %>%
+ t() %>% as.data.frame() %>%
+ rownames_to_column("sample_name")
>
> oxygen_molten_metadata <- tmp %>% reshape2::melt() %>%
+ merge(., phy_Q1_meta, all.x = TRUE, by = "sample_name") %>%
+ dplyr::rename(oxygen = variable)
>
> saveRDS(oxygen_molten_metadata, "../results/Figure1/OxygenMetabolism/oxygen_with_metadata_df.RDS")> library(rstatix)
>
> signif <- filter(oxygen_molten_metadata, oxygen != "NA") %>%
+ mutate(aerobiosis = oxygen) %>%
+ group_by(oxygen) %>%
+ rstatix::wilcox_test(value ~ smoking_exposure_ga) %>%
+ adjust_pvalue(method = "BH") %>%
+ #add_significance("p.adj") %>%
+ add_xy_position(x = "oxygen",step.increase = 0.05) %>%
+ filter(p.adj < 0.05)
>
> fig1C <- ggboxplot(data = filter(oxygen_molten_metadata, oxygen != "NA"),
+ x = "oxygen",
+ y = "value",
+ fill = "smoking_exposure_ga") +
+ ylab("Relative Abundance") +
+ xlab("")+
+ scale_fill_discrete(name = "Smoking habit", type = ggsci::pal_jco()(4)[c(1,2,4)]) +
+ stat_pvalue_manual(signif, tip.length = 0.01) +
+ theme_light()+
+ theme(legend.position = "none")Figure 1 Panel
> Fig1_ggpanel <- ggarrange(
+ ggarrange(fig1A,
+ fig1C,
+
+ nrow = 2, ncol = 1, labels = c("A","C"), heights = c(1,0.8)
+ ),
+ ggplot() + theme_void(),
+ fig1B,
+
+ nrow = 1, ncol = 3, labels = c("", "B", ""), widths = c(1,0.1,1)
+ )
>
> ggsave(
+ plot = Fig1_ggpanel,
+ filename = "../results/Figure1/Figure 1.png",
+ height = 7,
+ width = 8,
+ units = "in",
+ dpi = 600
+ )
>
> Fig1_ggpanelFigure 2 - Current smokers, regression vs dosage per day
> phy_Q2 <- phy_Q1 %>%
+ filter_sample_data(smoking_exposure_ga == "Current") %>%
+ filter_sample_data(!is.na(x0sm61)) %>%
+ filter_sample_data(x0sm61 < 60)
>
> phy_Q2 <- subset_taxa(phy_Q2, taxa_sums(phy_Q2)>0)
>
> phy_Q2_genus <- phy_tax_glom(phy_Q2, "Genus")
>
> dir.create("../results/Figure2", recursive = TRUE, showWarnings = FALSE)
>
> phy_Q2_genusphyloseq-class experiment-level object
otu_table() OTU Table: [ 82 taxa and 308 samples ]:
sample_data() Sample Data: [ 308 samples by 1500 sample variables ]:
tax_table() Taxonomy Table: [ 82 taxa by 8 taxonomic ranks ]:
phy_tree() Phylogenetic Tree: [ 82 tips and 81 internal nodes ]:
refseq() DNAStringSet: [ 82 reference sequences ]
taxa are rows
Regression
> if(!file.exists("../results/Figure2/deseq_smoking_regression_datafames.rds")){
+ deseq_raw_results_Q2 <- list()
+ for(gr in c("x0sm61_bin5cont", "x0sm61_bin2cont", "x0sm61")){
+
+ # create deseq object
+ fla <- as.formula(paste0("~ x0_ager + x0_sex + x0oh01 + ", gr))
+
+ phy_q2_genus_deseq_q2 <- phyloseq_to_deseq2(phy_Q2_genus, design = fla)
+
+ # calculate geometric means, important to center the counts
+
+ geom_means_q2 <- apply(counts(phy_q2_genus_deseq_q2), 1, function(x)
+ exp(mean(log(x[x > 0]))))
+
+ # estimate size factors, important to account for library size variations
+ estim_size_q2 <- estimateSizeFactors(phy_q2_genus_deseq_q2, geoMeans = geom_means_q2)
+
+ # compute the DESeq2 glm, accounting for the estimate size factor and geometric means
+
+ deseq2_results_q2 <- DESeq(estim_size_q2, fitType="local")
+ deseq_raw_results_Q2[[gr]] <- deseq2_results_q2
+ }
+
+ taxonomy <- as.data.frame(tax_table(phy_Q2_genus))[c("Genus", "Phylum")]
+ q2_results <- lapply(deseq_raw_results_Q2, function(res) results(res) %>%
+ as.data.frame() %>%
+ rownames_to_column("Genus") %>%
+ merge(taxonomy, by = "Genus", sort = FALSE, all.x = TRUE)
+ )
+
+ saveRDS(q2_results, "../results/Figure2/deseq_smoking_regression_datafames.rds")
+ rm(taxonomy)
+
+ } else{
+
+ q2_results <- readRDS("../results/Figure2/deseq_smoking_regression_datafames.rds")
+ }Figure 2A - Heatmap
Figure 2 B,C,D
> Fig2BCD <-
+ filter(
+ oxygen_molten_metadata,
+ oxygen != "NA" &
+ smoking_exposure_ga == "Current",
+ x0sm61 < 60
+ ) %>%
+ group_split(oxygen) %>%
+ lapply(
+ function(ox)
+ ggscatter(ox,
+ x = "x0sm61",
+ y = "value",
+ alpha = 0.5,
+ color = "gray30") +
+ geom_smooth(se = TRUE) +
+ theme_light()+
+ labs(
+ x = "Tobacco smoked per day (g)",
+ y = "Relative Abundance",
+ title = unique(ox$oxygen)
+ )
+ )> filter(
+ oxygen_molten_metadata,
+ oxygen == "Aerobic" &
+ smoking_exposure_ga == "Current",
+ x0sm61 < 60
+ ) %>%
+ lm(value ~ I(1/x0sm61) + x0_ager + x0_sex + x0oh01, data = .) %>%
+ summary() %>%
+ broom::tidy() %>%
+ mutate(term = c("Intercept", "1/tobacco exposure intensity (g/day)", "Age", "Sex", "1-9 teeth", "10-19 teeth", "20+ teeth")) %>%
+ kbl(caption = "The relative abundance of AEROBIC bacteria decreases with increasing grams of tobacco smoked in a day, up to 10 g/day. After 10 grams, it is no longer statistically associated (data not shown)", digits = 3) %>%
+ kable_styling()| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| Intercept | -0.007 | 0.012 | -0.598 | 0.550 |
| 1/tobacco exposure intensity (g/day) | 0.027 | 0.008 | 3.545 | 0.000 |
| Age | 0.000 | 0.000 | 1.579 | 0.115 |
| Sex | 0.005 | 0.004 | 1.322 | 0.187 |
| 1-9 teeth | 0.004 | 0.011 | 0.331 | 0.741 |
| 10-19 teeth | 0.024 | 0.009 | 2.571 | 0.011 |
| 20+ teeth | 0.024 | 0.009 | 2.645 | 0.009 |
Figure 2 - Panel
> # Fig2_ggpanel <- ggarrange(
> # ggplot()+ theme_void(),
> # ggplotify::as.ggplot(Fig2A),
> # ggarrange(
> # Fig2BCD[[1]],
> # Fig2BCD[[2]],
> # Fig2BCD[[3]],
> #
> # nrow = 3, ncol = 1, labels = c("B", "C", "D")
> # ),
> #
> # nrow = 1, ncol = 3, widths = c(0.1, 1.5,1), labels = c("A", "", "")
> # )
>
> Fig2_ggpanel <- ggarrange(
+ #ggplot()+ theme_void(),
+ ggplotify::as.ggplot(Fig2A),
+ ggarrange(
+ Fig2BCD[[1]],
+ Fig2BCD[[2]],
+ Fig2BCD[[3]],
+
+ nrow = 3, ncol = 1, labels = c("B", "C", "D")
+ ),
+
+ nrow = 1, ncol = 2,
+ widths = c(1.5,1),
+ labels = c("A", "")
+ )
>
>
> ggsave(
+ plot = Fig2_ggpanel,
+ filename ="../results/Figure2/Figure 2.png",
+ height = 7,
+ width = 8,
+ units = "in",
+ dpi = 600
+ )
>
> Fig2_ggpanelFigure 3 - Former smokers, regression vs years since quitting
> dir.create("../results/Figure3/")> phy_Q3 <- filter_sample_data(phy_Q1,
+ smoking_exposure_ga == "Former",
+ !is.na(years_since_quitting_or_reducing),
+ x0sm40 >= x0sm34 # meaning: (age when they quit) >= (age when they started)
+ )
>
> phy_Q3 <- subset_taxa(phy_Q3, taxa_sums(phy_Q3) > 0)
>
> phy_Q3@sam_data$years_since_quitting_or_reducing %<>% as.numeric()
>
> phy_Q3_genus <- phy_tax_glom(phy_Q3, "Genus")
>
> phy_Q3phyloseq-class experiment-level object
otu_table() OTU Table: [ 622 taxa and 369 samples ]:
sample_data() Sample Data: [ 369 samples by 1500 sample variables ]:
tax_table() Taxonomy Table: [ 622 taxa by 8 taxonomic ranks ]:
phy_tree() Phylogenetic Tree: [ 622 tips and 621 internal nodes ]:
refseq() DNAStringSet: [ 622 reference sequences ]
taxa are rows
Regression calculations and summary table
> library(DESeq2)
>
> phy_q3_genus_deseq <- phyloseq_to_deseq2(phy_Q3_genus, design = ~ x0_ager_cat + x0_sex + x0oh01 + years_since_quitting_or_reducing)
>
> # calculate geometric means, important to center the counts
> geom_means_q3 <- apply(counts(phy_q3_genus_deseq), 1, function(x)
+ exp(mean(log(x[x > 0]))))
>
> # estimate size factors, important to account for library size variations
> estim_size_q3 <- estimateSizeFactors(phy_q3_genus_deseq, geoMeans = geom_means_q3)
>
> # compute the DESeq2 glm, accounting for the estimate size factor and geometric means
>
> deseq2_results_q3 <- DESeq(estim_size_q3, fitType="local")
>
> results_q3 <- results(deseq2_results_q3, name = "years_since_quitting_or_reducing")
>
> results_q3.df <- as.data.frame(results_q3)
>
> DT::datatable(arrange(results_q3.df, padj) %>% mutate_if(is.numeric, round, 3), caption = "Table 'Years since quitting' | Results of regression of microbial genera in response to the years since quitting smoking. The model was corrected for age, sex and number of teeth, nominal P-values correceted for False Discovery Rate with the Benjamini-Hochberg method.")Figure 3A - Heatmap
Figure 3 B,C,D
> fig3BCD <-
+ filter(
+ oxygen_molten_metadata,
+ oxygen != "NA" &
+ sample_name %in% sample_names(phy_Q3_genus) &
+ years_since_quitting_or_reducing <= 40
+ ) %>%
+ group_split(oxygen) %>%
+ lapply(
+ function(ox)
+ ggscatter(ox,
+ x = "years_since_quitting_or_reducing",
+ y = "value",
+ alpha = 0.5
+ ) +
+ geom_smooth(se = TRUE) +
+ xlab("Years since quitting smoking") +
+ ylab("Relative Abundance") +
+ ggtitle(unique(ox$oxygen)) +
+
+ theme_light()+geom_point(alpha = 0.1, color = "gray50")
+ )> filter(
+ oxygen_molten_metadata,
+ oxygen == "Aerobic" &
+ sample_name %in% sample_names(phy_Q3_genus) &
+ years_since_quitting_or_reducing <= 20) %>%
+ lm(value ~ years_since_quitting_or_reducing + x0_ager + x0_sex + x0oh01, data = .) %>%
+ summary() %>%
+ broom::tidy() %>%
+ mutate(term = c("Intercept", "Years since quitting", "Age", "Sex", "1-9 teeth", "10-19 teeth", "20+ teeth")) %>%
+ kbl(caption = "The relative abundance of AEROBIC bacteria increases with increasing years since quitting, up to 20 years. After 20 years, it is no longer statistically associated (data not shown)", digits = 3) %>%
+ kable_styling()| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| Intercept | 0.024 | 0.024 | 0.995 | 0.321 |
| Years since quitting | 0.001 | 0.001 | 1.944 | 0.053 |
| Age | 0.000 | 0.000 | 1.022 | 0.308 |
| Sex | 0.002 | 0.006 | 0.359 | 0.720 |
| 1-9 teeth | 0.029 | 0.017 | 1.703 | 0.090 |
| 10-19 teeth | 0.025 | 0.017 | 1.494 | 0.137 |
| 20+ teeth | 0.020 | 0.016 | 1.254 | 0.211 |
> filter(
+ oxygen_molten_metadata,
+ oxygen == "Anaerobic" &
+ sample_name %in% sample_names(phy_Q3_genus) &
+ years_since_quitting_or_reducing <= 20) %>%
+ lm(value ~ years_since_quitting_or_reducing + x0_ager + x0_sex + x0oh01, data = .) %>%
+ summary() %>%
+ broom::tidy() %>%
+ mutate(term = c("Intercept", "Years since quitting", "Age", "Sex", "1-9 teeth", "10-19 teeth", "20+ teeth")) %>%
+ kbl(caption = "The relative abundance of ANAEROBIC bacteria increases with increasing years since quitting, up to 20 years. After 20 years, it is no longer statistically associated (data not shown)", digits = 3) %>%
+ kable_styling()| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| Intercept | 0.704 | 0.050 | 14.095 | 0.000 |
| Years since quitting | -0.001 | 0.001 | -0.691 | 0.491 |
| Age | -0.001 | 0.001 | -2.212 | 0.028 |
| Sex | -0.016 | 0.012 | -1.309 | 0.192 |
| 1-9 teeth | -0.025 | 0.036 | -0.685 | 0.494 |
| 10-19 teeth | -0.003 | 0.035 | -0.090 | 0.929 |
| 20+ teeth | 0.011 | 0.034 | 0.313 | 0.755 |
Figure 3 - Panel
> Fig3_ggpanel <- ggarrange(
+ ggplotify::as.ggplot(fig3A),
+ ggarrange(fig3BCD[[1]], fig3BCD[[2]], fig3BCD[[3]],
+
+ nrow = 3, ncol = 1, labels = c("B", "C", "D")
+ ),
+
+ nrow = 1, ncol = 2, widths = c(1.5,1), labels = c("A", "")
+ )
>
>
> ggsave(
+ plot = Fig3_ggpanel,
+ filename = "../results/Figure3/Figure 3.png",
+ height = 7,
+ width = 8,
+ units = "in",
+ dpi = 600
+ )
>
> Fig3_ggpanelFigure 4 - Pathways differential abundance
PICRUSt2 input data were generated using the full CHRISMB dataset filtered for prevalence and detection (resp. 1% and 10 counts), which had 623 features for 1928 samples. I though I would do this for all samples once and for all, the set of ASVs doesn’t change if I pick the smoking paper subset or the CHRISMB full one. The discriminative part is the set of ASVs.
Results are stored in /home/gantonello/CHRISMB/PICRUSt2_prediction_v2.5/ subdivided into stratified and unstratified. the R script to generate the input data is in each of those two base directories.
Then I imported the two informative datasets:
- EC codes per sample (like looking at single enzyme level)
- pathway abundances (like looking at the aggregated enzymes)
HOW DOES PICRUSt2 ASSIGN EC AND PATHWAYS? It has some mapping files inside the environment.
If you run the picrust_pipeline.py with default settings, PICRUSt2 will take information from:
/home/gantonello/miniconda3/envs/picrust2/lib/python3.8/site-packages/picrust2/
And from there it will:
- Assign the ASVs reference seqs to the closest bacterial genome
- If the distance from the closest annotated reference genome is less than 2, it assumes identity and will assign a set of potential functions using
./default_files/prokaryotic/ec.txt.gz - EC numbers are converted into enzyme names with
./default_files/pathway_mapfiles/ec_level4_to_metacyc_rxn.tsv - these enzyme names are assigned to pathway using
./default_files/pathway_mapfiles/metacyc_path2rxn_struc_filt_pro.txt
Then the names get nicer names with a simple, final function, and that’s what I imported for the analysis.
Let’s start with pathways, correcting for age category, sex, and N° teeth, as the rest of the paper draft
> dir.create("../results/Figure4/", showWarnings = F)> picrust_pathways_matrix <- data.table::fread("/home/gantonello/CHRISMB/PICRUSt2_prediction_v2.5/unstratified/picrust2_output/pathways_out/path_abun_unstrat_descrip.tsv.gz") %>%
+ dplyr::select(-pathway) %>%
+ column_to_rownames("description") %>%
+ as.matrix()
>
> picrust_pathways_matrix %<>% round(0)
>
> picrust_pathways_matrix_Q1 <- picrust_pathways_matrix[!grepl("super", rownames(picrust_pathways_matrix)),sample_names(phy_Q1)]
>
> #create a map data frame for pathway names
>
> pathway_names_map_ALL <- data.frame(original_name = rownames(picrust_pathways_matrix_Q1),
+ fixed_name = make.names(rownames(picrust_pathways_matrix_Q1)))Diff abund 1: DESeq2
> dir.create("../results/Figure4/DiffAbundGenera/DESeq2/", recursive = T)
>
> if(!file.exists("../results/Figure4/DiffAbundGenera/DESeq2/figure4_deseq2_dataframe.list.RDS")){
+
+ library(DESeq2)
+ #############################################################################
+
+ deseq_object_picrust <- phyloseq(otu_table(picrust_pathways_matrix_Q1, taxa_are_rows = TRUE), sample_data(phy_Q1)) %>%
+ core(detection = 10, prevalence = 0.05) %>%
+ phyloseq_to_deseq2(design = ~ x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga)
+
+ geom_means <- apply(counts(deseq_object_picrust), 1, function(x) exp(mean(log(x[x>0]))))
+
+ # estimate size factors, important to account for library size variations
+ estim_size <- estimateSizeFactors(deseq_object_picrust, geoMeans = geom_means)
+
+ # compute the DESeq2 glm, accounting for the estimate size factor and geometric means
+
+ deseq2_results_picrust <- DESeq(estim_size, fitType = "local")
+
+ saveRDS(deseq2_results_picrust, "../results/Figure4/DiffAbundGenera/DESeq2/figure4_deseq2_dataframe.list.RDS")
+ } else{
+ deseq2_results_picrust <- readRDS("../results/Figure4/DiffAbundGenera/DESeq2/figure4_deseq2_dataframe.list.RDS")
+ }Diff abund 2: LinDA
Diff abund 3: Maaslin2
Diff abund 4: ALDEx2 (glm)
> dir.create("../results/Figure4/DiffAbundGenera/ALDEx2/", recursive = T)
>
> if (!file.exists("../results/Figure4/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults.list.RDS")) {
+
+ library(ALDEx2)
+
+ model_mtx <-
+ model.matrix(
+ ~ x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga,
+ data = dplyr::select(
+ phy_Q1_meta,
+ x0oh01,
+ x0_sex,
+ x0_ager_cat,
+ smoking_exposure_ga
+ ) %>% .[intersect(colnames(picrust_pathways_matrix_Q1),
+ rownames(phy_Q1_meta)), ]
+ )
+
+ aldex.glm.pathway_smoking_ga <-
+ picrust_pathways_matrix_Q1[, intersect(colnames(picrust_pathways_matrix_Q1),
+ rownames(phy_Q1_meta))] %>%
+ aldex.clr(
+ model_mtx,
+ mc.samples = 150,
+ denom = "all",
+ useMC = T
+ )
+
+ glm.test_pathway_smoking_ga <-
+ aldex.glm(aldex.glm.pathway_smoking_ga, model_mtx)
+
+ glm.effects_pathway_smoking_ga <-
+ aldex.glm.effect(aldex.glm.pathway_smoking_ga, verbose = FALSE, include.sample.summary = T, useMC = T)
+
+ aldex.results <- list(clr.model = aldex.glm.pathway_smoking_ga,
+ glm.test = glm.test_pathway_smoking_ga,
+ eff_size.test = glm.effects_pathway_smoking_ga)
+
+
+ saveRDS(aldex.results, "../results/Figure4/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults.list.RDS")
+
+ detach("package:ALDEx2", unload = T)
+ detach("package:zCompositions", unload = T)
+
+ } else{
+ aldex.results <- readRDS("../results/Figure4/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults.list.RDS")
+ }Diff abund 5: ANCOM-BC
> dir.create("../results/Figure4/DiffAbundGenera/ANCOM-BC/", recursive = T)
>
> if (!file.exists("../results/Figure4/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults.list.RDS")) {
+ tmp <- phyloseq(otu_table(picrust_pathways_matrix_Q1, taxa_are_rows = T), sample_data(phy_Q1_genus %>%
+ select_sample_data(x0_sex, x0_ager_cat, x0oh01, smoking_exposure_ga) %>% meta())) %>%
+ select_sample_data(x0_sex, x0_ager_cat, x0oh01, smoking_exposure_ga) %>%
+ core(detection = 10, prevalence = 0.05) %>%
+ mia::makeTreeSummarizedExperimentFromPhyloseq()
+
+
+ library(ANCOMBC)
+
+ ancom_results_picrust <- ancombc2(data = tmp,
+ assay_name = "counts",
+ tax_level = NULL,
+ fix_formula = "x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga",
+ group = "smoking_exposure_ga",
+ struc_zero = TRUE,
+ iter_control = list(tol = 1e-5, max_iter = 20, verbose = FALSE),
+ em_control = list(tol = 1e-5, max_iter = 20),
+ p_adj_method = "BH",
+ n_cl = 1,
+ alpha = 0.05)
+
+ saveRDS(ancom_results_picrust, "../results/Figure4/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults.list.RDS")
+
+ }else{
+
+ ancom_results_picrust <- readRDS("../results/Figure4/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults.list.RDS")
+
+ }Summary of results
Results filtered by two conditions:
- FDR (5%) corrected \(q-value < 0.05\)
- The effect size, irrespectively of the way it is estimated, must be larger than 0.5
- Relative Abundance
- Mean per smoking group
- Z-scaled for plotting
> if(!exists("pathways_consensus")){
+ pathways_consensus <- readRDS("../results/Figure4/significant_PWYs_consensus.RDS")
+ }
>
> dummy_tax_table <- data.frame(pathway = rownames(picrust_pathways_matrix_Q1)) %>% set_rownames(rownames(picrust_pathways_matrix_Q1)) %>% as.matrix()
>
>
> pathways_summarized_plot <- phyloseq(otu_table(picrust_pathways_matrix_Q1, taxa_are_rows = T),
+ sample_data(phy_Q1_meta), tax_table(dummy_tax_table)) %>%
+ phy_transform("compositional") %>%
+ filter_sample_data(sample_name %in% c(sample_names(phy_Q2), sample_names(phy_Q3)) | smoking_exposure_ga == "Never") %>%
+ filter_tax_table(taxa_names(.) %in% pathways_consensus) %>%
+ mutate_sample_data(smoking_detailed = add_N_to_catVar(smoking_detailed)) %>%
+ phy_summarise_taxa_by_metadata(metadata_var = "smoking_detailed", .fun = mean)
>
> ggsave(plot = ggplotify::as.ggplot(pheatmap::pheatmap(pathways_summarized_plot, scale = "none", main = "Unscaled Rows", cluster_cols = FALSE, angle_col = 315)),
+ filename = "../results/Figure4/Figure 4 picrust_unscaledRows.png", dpi = 600, height = 6, width = 8)> # save also scaled rows for potential alternative use
>
> ggsave(plot = ggplotify::as.ggplot(pheatmap::pheatmap(pathways_summarized_plot, scale = "row", main = "Scaled Rows", cluster_cols = FALSE, angle_col = 315)),
+ filename = "../results/Figure4/Figure4 picrust_scaledRows.png", dpi = 600, height = 6, width = 8)Interesting note, the CLR transformation is dependent somehow on the number of features in the table, meaning that if you subset before creating the phyloseq object to make the transformation, you end up with a correct transformation, but scaled only on those pathways you selected. In my case, this approach was masking a few signals, so my advice is:
- create a phyloseq object with all features (taxa, pathways, Kegg Orthology enzymes …) (optional, I use it because the transformaton functions are implemented)
- re-convert into
data.framewith eitherspeedyseq::psmelt()orphy_OtuMetaTable() - make all the plots you want
Notice that many pathways are actually highly correlated, like the quinols. let’s see this in a heatmap
> signif_pathways_names <- readRDS("../results/Figure4/significant_PWYs_consensus.RDS")
>
> picrust_pathways_matrix_Q1[signif_pathways_names,] %>%
+ apply(2, function(s) s/sum(s)) %>%
+ t() %>%
+ stats::cor(method = "spearman") %>%
+ pheatmap::pheatmap(show_colnames = FALSE, main = "Spearman correlation between significant pathways")> ggarrange(ggarrange(plot_correlations_eff.sizes,
+ venn_plot+
+ scale_x_continuous(expand = expansion(mult = .2)),
+
+ nrow = 2, ncol = 1, heights = c(1,0.5)
+ ),
+
+ heatmap_plot,
+
+ nrow = 1, ncol = 2
+ ) %>%
+ annotate_figure(top = text_grob("Discussion of differential abundance\nresults similarity across methods", size = 16, face = "bold", color = "red2"),
+ bottom = text_grob(paste("NAs in the heatmap were arbitrarily set to", NA_value, "in order not to fail clustering,\nwhile still keeping evident track of where a method did not return a pathway as significant"), size = 10))From these results we find out that effect sizes and the number o ALDEx2 and ANCOM-BC are further away, and that LinDA, despite being considered by authors as very similar to ANCOM-BC, in our results it seems more similar to MaAsLin2, with the added value of being extremely fast.